import pandas as pd
from itertools import zip_longest
from IPython.display import HTML as html_print
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import re
# Load the hold-out proteins and order them by idProtein in one step; the bare
# name on the last line makes the notebook render the resulting table.
test = pd.read_csv('../ViralFP_dataset/data/holdout_dataset.csv').sort_values(by=['idProtein'])
test
| idProtein | Name | Class | Activation | Name_Fusogenic_Unit | Location_Fusogenic | Sequence_fusogenic | UniProtID | NcbiID | idTaxonomy | ... | idTaxonomy.1 | CommonName | Family | Genre | Species | SubSpecies | NcbiTax | FP corrected DL | Notes DL | seq_vfp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 105 | Envelope glycoprotein | I | binding to receptor | Envelope protein p15E | 466-644 | DPISLTVALMLGGITVGGMARNRNRDCGLLETAQFRQLQMAMHTDI... | P21443 | NaN | 105 | ... | 105 | Feline leukemia virus (isolate CFE-6) | Retroviridae | Gammaretrovirus | Feline leukemia virus | clone CFE-6 | 11922 | NaN | NaN | PISLTVALMLGGITVGGMARN |
| 6 | 146 | Genome polyprotein | II | NaN | Envelope glycoprotein E1 | 192-384 | LEYRNASGLYLLTNDCSNRSIVYEADDVILHLPGCVPCVETDNNNT... | Q81487 | NaN | 146 | ... | 146 | Hepatitis C virus | Flaviviridae | Hepacivirus | Hepacivirus C | isolate Tr Kj | 357355 | NaN | NaN | LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR |
| 5 | 158 | Genome polyprotein | II | NaN | Envelope glycoprotein E1 | 192-383 | LEYRNASGLYTVTNDCSNGSIVYEAGDVILHLPGCIPCVRLNNASK... | Q68801 | NaN | 158 | ... | 158 | Hepatitis C virus | Flaviviridae | Hepacivirus | Hepacivirus C | isolate JK049 | 356417 | NaN | NaN | MVGAATLCSALYVGDLCGALFLVGQGFSWRHR |
| 0 | 192 | Spike glycoprotein S | I | interacting with receptor | Corona S2 | 759-1353 | AITTGYRFTNFEPFTVNSVNDSLEPVGGLYEIQIPSEFTIGNMVEF... | P36334 | NaN | 192 | ... | 192 | Human Coronavirus | Coronaviridae | Betacoronavirus | Betacoronavirus 1 | OC43 | 31631 | NaN | NaN | LAATSASLFPPWTAAAGVPFY |
| 7 | 225 | Envelope glycoprotein gp160 (Env polyprotein) | I | binding to receptor | gp41 - by similarity | 499-843 | AVGMGAVLFGFLGAAGSTMGAAAITLTAQARQLLSGIVQQQSNLLK... | Q9QBZ4 | NaN | 225 | ... | 225 | Human immunodeficiency virus | Retroviridae | Lentivirus | Human immunodeficiency virus 1 | M:F2_MP255C | 388815 | NaN | NaN | AVGMGAVLFGFLGAAGSTMGA |
| 1 | 265 | Envelope glycoprotein gp160 (Env polyprotein) | I | binding to receptor | gp41 - by similarity | 488-832 | AAGLGALFLGFLGDSREHMGAASITLTVQARQLLSGIVQQQNNLLR... | Q9QSQ7 | NaN | 265 | ... | 265 | Human immunodeficiency virus | Retroviridae | Lentivirus | Human immunodeficiency virus 1 | M:F1_VI850 | 388813 | NaN | NaN | AAGLGALFLGFLGDSREHMGA |
| 4 | 574 | Pre-glycoprotein polyprotein GP complex | I | low pH | GP2 | 266-498 | GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAVAKCNVNH... | P07399 | NaN | 574 | ... | 574 | Lymphocytic choriomeningitis virus | Arenaviridae | Mammarenavirus | Lymphocytic choriomeningitis mammarenavirus | (strain WE) | 11627 | GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV | Same as Lassa | GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV |
| 3 | 618 | fusion glycoprotein F0 | I | binding to receptor | F1 | 102-537 | FLGLILGLGAAVTAGVALAKTVQLESEIALIRDAVRNTNEAVVSLT... | P35949 | NaN | 618 | ... | 618 | Murine pneumonia virus (strain 15) (MPV) | Pneumoviridae | Orthopneumovirus | Murine orthopneumovirus | of mice 15 | 296738 | NaN | NaN | FLGLILGLGAAVTAGVALAKT |
| 8 | 779 | G glycoprotein | III | low pH | H2 | NaN | MKCLLYLAFLSIGVNCKFTIVFPHNQKGTWKNVPSNYHYCPSSSDL... | NaN | NaN | 774 | ... | 774 | Vesicular stomatitis virus | Rhabdoviridae | Vesiculovirus | Indiana vesiculovirus | (strain 94GUB Central America) | 434489 | NaN | NaN | [FRWYGPKY CGYATVT] |
| 9 | 804 | Spike glycoprotein | I | binding to receptor | S2 | 686-1273 | SVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSM... | P0DTC2 | NaN | 804 | ... | 804 | SARS-CoV-2 | Coronaviridae | Betacoronavirus | Severe acute respiratory syndrome-related | 2 | 2697049 | NaN | NaN | SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVL... |
10 rows × 26 columns
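# Per-residue topology predicted by DeepTMHMM (https://dtu.biolib.com/DeepTMHMM), run on 2022/12/29.
# Keys are idProtein values as strings; each value holds one label per residue of the fusogenic sequence.
# Labels read by the code below: 'M' = membrane, 'S' = signal peptide; 'O' and 'I' are presumably outside/inside.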
tmd = {'192':'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
'265': 'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
'105':'SSSSSSSSSSOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIII',
'618':'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIII',
'574':'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
'158':'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMIMMMMMMMMMMOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMIIIIIIIIIMMMMMMMMMMMMMMMMMMMMMOOOOMMMMMMMMMMMMMMI',
'146':'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMIIIIIIIIMMMMMMMMMMMMMMMMMMMMMOOOOMMMMMMMMMMMMMMMI',
'225':'SSSSSSSSSSOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
'779':'SSSSSSSSSSSSSSSSOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIII',
'804':'OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'
}
Functions to plot the predicted scores along the sequence.
# get html element
def cstr(s, bold, color='black'):
    """Wrap one character in an HTML ``<text>`` element with a background colour.

    Parameters
    ----------
    s : str
        The character to render. A single space gets a padded, empty element.
    bold : object
        When equal to ``'b'`` the character is rendered bold and underlined;
        any other value renders it plainly.
    color : str
        Value inserted after ``background-color:`` in the inline style.

    Returns
    -------
    str
        The HTML fragment.
    """
    # A blank is drawn as a padded, empty coloured cell.
    if s == ' ':
        return "<text style=color:#000;padding-left:10px;background-color:{}> </text>".format(color)

    if bold == 'b':
        template = "<text style=color:#000;background-color:{}> <b><u>{}</u></b> </text>"
    else:
        template = "<text style=color:#000;background-color:{}>{} </text>"
    return template.format(color, s)
# print html
def print_color(t, bold_letter):
    """Render a coloured sequence as HTML in the notebook output.

    Parameters
    ----------
    t : sequence
        One entry per character; ``entry[0]`` is the character and
        ``entry[1]`` its background colour.
    bold_letter : sequence
        Parallel to ``t``; element ``[1]`` of each entry is passed to ``cstr``
        as the bold marker.
    """
    fragments = []
    for position, entry in enumerate(t):
        marker = bold_letter[position][1]
        fragments.append(cstr(entry[0], marker, color=entry[1]))
    display(html_print(''.join(fragments)))
# get appropriate color for value
def get_clr(value):
    """Return the hex colour of ``value`` on the 'YlOrRd' colormap.

    Parameters
    ----------
    value : float
        Score to colour. Callers in this file pass values between 0 and 1.

    Returns
    -------
    str
        Colour as a hex string, as produced by ``matplotlib.colors.rgb2hex``.
    """
    # plt.get_cmap replaces cm.get_cmap, which was deprecated in Matplotlib 3.7
    # and removed in 3.9; plt.get_cmap is available in old and new releases.
    cmap = plt.get_cmap('YlOrRd')
    rgba = cmap(value)
    # rgb2hex accepts rgb or rgba
    return matplotlib.colors.rgb2hex(rgba)
# TODO: check if this is ok (flagged by the original author)
window = 21


def get_scores_seq(seq, file_predict, mode='max'):
    """Aggregate the model's per-window predictions into one score per position.

    Parameters
    ----------
    seq : str
        The sequence being scored; only its length is used here.
    file_predict : pandas.DataFrame
        Model predictions, one row per window. If it has more than two
        columns the 'prob_class_1' column is used, otherwise 'class predicted'.
    mode : {'max', 'mean'}
        How the rows associated with a position are combined.

    Returns
    -------
    list of float
        ``len(seq) - 1`` scores, each rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'max' nor 'mean'.

    Notes
    -----
    NOTE(review): position ``i`` is scored from rows ``i .. i + window - 1``,
    i.e. from windows that *start* at or after ``i``. If row ``j`` is the
    window starting at residue ``j`` (as ``output`` assumes), the windows that
    actually contain residue ``i`` are ``i - window + 1 .. i``. The alignment
    is kept as it was; confirm which one is intended.
    """
    if mode == 'max':
        aggregate = max
    elif mode == 'mean':
        aggregate = np.mean
    else:
        # Previously an unknown mode surfaced as an UnboundLocalError.
        raise ValueError("mode must be 'max' or 'mean', got {!r}".format(mode))

    # The column choice does not depend on the position, so decide it once.
    if len(file_predict.columns) > 2:  # probabilities are available
        score_column = 'prob_class_1'
    else:
        score_column = 'class predicted'
    scores = file_predict[score_column]
    n_rows = len(file_predict)

    list_scores = []
    for i in range(len(seq) - 1):
        if i >= n_rows:
            # Past the last prediction: reuse the final row's score.
            val = [scores.iloc[n_rows - 1]]
        else:
            # Slicing clips at the end of the data, so this also covers the
            # positions where fewer than `window` rows remain.
            val = scores.iloc[i:i + window]
        list_scores.append(round(float(aggregate(val)), 4))
    return list_scores
def graphic(new_score):
    """Draw ``new_score`` as a line plot and show the figure."""
    plt.plot(new_score)
    plt.show()
def visualize(file_predict, seq, location, mode):
    """Plot the predicted scores and print the sequence coloured by score.

    The scores from ``get_scores_seq`` are first drawn as a line plot, then the
    sequence is rendered as HTML with each residue's background colour taken
    from its score and the true peptide residues marked bold/underlined.

    Parameters
    ----------
    file_predict : pandas.DataFrame
        Model predictions, forwarded to ``get_scores_seq``.
    seq : str
        The sequence to render.
    location : tuple or list of tuple
        ``(start, end)`` of the true peptide with ``end`` exclusive, or a list
        of such tuples when the peptide has several segments.
    mode : str
        Aggregation mode, forwarded to ``get_scores_seq``.
    """
    new_score = get_scores_seq(seq, file_predict, mode)
    graphic(new_score)

    # Treat a single (start, end) tuple as a one-segment list so that single
    # and multi-segment peptides share the same code path.
    segments = location if isinstance(location, list) else [location]

    text_colours = []
    bold_letter = []
    # NOTE(review): the last two residues of `seq` are not rendered; kept as in
    # the original, the reason is not visible here.
    for i in range(len(seq) - 2):
        residue = seq[i]
        text_colours.append((residue, get_clr(new_score[i])))
        # `end` is exclusive, so the half-open test covers the whole peptide.
        # The previous `range(start, end - 1)` left its last residue unmarked.
        in_peptide = any(start <= i < end for start, end in segments)
        bold_letter.append((residue, 'b' if in_peptide else np.nan))

    print_color(text_colours, bold_letter)
def visualize_tmd(tmd_str, sequence_fusogenic):
    """Print the sequence coloured by its per-residue topology labels.

    Residues labelled 'M' and 'S' get distinct colours, every other label gets
    the colour of score 0. The residues labelled 'M' and 'S' are also printed
    as lists after the coloured sequence.

    Parameters
    ----------
    tmd_str : str
        One label character per residue (the module-level ``tmd`` dict holds
        such strings).
    sequence_fusogenic : str
        The sequence the labels refer to. It is indexed by position in
        ``tmd_str``, so an IndexError is raised if it is shorter than
        ``tmd_str``.
    """
    text_colours = []
    bold_letter = []
    membrane_residues = []
    signal_residues = []

    for i, ch in enumerate(tmd_str):  # labels come from 'OSMI'
        aa = sequence_fusogenic[i]
        if ch == 'M':
            membrane = 0.9
            membrane_residues.append(aa)
        elif ch == 'S':
            # Signal peptide: 0.5 is only a value that yields a different colour.
            membrane = 0.5
            signal_residues.append(aa)
        else:
            membrane = 0.0
        text_colours.append((aa, get_clr(membrane)))
        # print_color reads element [1] of each entry as the bold marker, so
        # use the same (letter, marker) shape as visualize. The former plain
        # string 'none' only worked because 'none'[1] happens not to be 'b'.
        bold_letter.append((aa, None))

    print_color(text_colours, bold_letter)
    print('tmd', str(membrane_residues))
    print('signal', str(signal_residues))
path_results = 'results/'


def _locate_true_peptides(true_subseq, fusogenic):
    """Return the half-open ``(start, end)`` span of each true peptide segment.

    A value written as ``'[SEG1 SEG2]'`` is split on whitespace into separate
    segments; anything else is treated as one segment. ``str.index`` raises
    ValueError when a segment does not occur in ``fusogenic``.
    """
    if true_subseq.startswith('[') and true_subseq.endswith(']'):
        segments = true_subseq[1:-1].split()
    else:
        segments = [true_subseq]
    locations = []
    for segment in segments:
        start = fusogenic.index(segment)
        locations.append((start, start + len(segment)))
    return locations


def output(model_name_specific):
    """Report one model's predictions for every protein in the hold-out set.

    For each ``idProtein`` in the module-level ``test`` frame this reads
    ``path_results + model_name_specific + '/TESTSEQ<id>.csv'``, shows the
    model scores along the fusogenic sequence (``visualize``), shows the
    topology stored in the module-level ``tmd`` dict (``visualize_tmd``) and
    prints the prediction found at the start index of each true peptide
    segment. When the predictions file has more than two columns, the class-1
    probability and the five highest-scoring rows are printed as well.

    Parameters
    ----------
    model_name_specific : str
        Sub-directory of ``path_results`` holding the model's prediction files.
    """
    for idProtein in test['idProtein']:
        print('################################')
        print(idProtein)
        file_predict = pd.read_csv(path_results+model_name_specific + '/TESTSEQ{}.csv'.format(idProtein))
        test_id = test.loc[test['idProtein'] == idProtein]
        true_subseq = str(test_id['seq_vfp'].values[0])
        fusogenic = str(test_id['Sequence_fusogenic'].values[0])
        print(test_id['Name'].values[0])
        tmd_predict = tmd[str(idProtein)]

        print(true_subseq)
        # Locate every segment of the true peptide. The bipartite case used to
        # leave the reported index unset, so the report below silently reused
        # the index of the previously processed protein.
        locations = _locate_true_peptides(true_subseq, fusogenic)
        # visualize takes a list for multi-segment peptides, a tuple otherwise.
        location_of_vfp = locations if len(locations) > 1 else locations[0]

        print('vfp model prediction')
        # plot the vfp model predictions
        mode = 'max'  # or 'mean' value of each letter
        # Each row's score stands for the window starting at that position;
        # `mode` decides how those rows are combined per position.
        visualize(file_predict, seq = fusogenic, location = location_of_vfp, mode=mode)

        print('tmd prediction by DeepTMHMM')
        visualize_tmd(tmd_predict,fusogenic)

        # ### relevant info
        has_proba = len(file_predict.columns) > 2  # probabilities are available
        for index_of_vfp, _ in locations:
            row = file_predict.loc[index_of_vfp]
            print('vfp predicted:', row['class predicted'])
            if has_proba:
                print('score vfp predicted:', row['prob_class_1'])
        if has_proba:
            df_sorted = file_predict.sort_values(by='prob_class_1', ascending=False)
            print('\n max score predicted')
            print('\n top scores')
            print(df_sorted[:5])
        for index_of_vfp, _ in locations:
            print('index of true', index_of_vfp)
True viral fusion peptides are shown in bold and underlined. The background colour follows the YlOrRd colormap: the darker the red, the higher the score.
# model_name_specific = 'phys_ml/All_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_svc' # 3 right. 1 wrong. 1 sars
# model_name_specific = 'phys_ml/all_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_svc_prob_weights' # 3 right. 1 wrong. 1 sars very close t just svc
# model_name_specific = 'phys_ml/All_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_rf' # predicts 2
# model_name_specific = 'phys_ml/all_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_rf_weights' # gets 2
# model_name_specific = 'phys_ml/all_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_gboosting' # gets 2
# model_name_specific = 'phys_ml/all_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_gboosting' # it gets 2
# model_name_specific = 'phys_ml/all_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_rf' # it gets 2
# model_name_specific = 'phys_ml/all_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_svc' # predicts 3 well, 1 different, and sarscov
# model_name_specific = 'phys_ml/Half_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_rf'
# really good! 6 right. 1 right with extra wrong. 3 for sarscov
# model_name_specific = 'phys_ml/Half_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_svc' # predicts a lot of things
# model_name_specific = 'phys_ml/half_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_gboosting' # 5 right. 1 wrong. 4 for Sars
# model_name_specific = 'phys_ml/half_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_rf' # 5 right. 1 right plus an extra wrong prediction. 1 wrong. 3 sarscov
# model_name_specific = 'phys_ml/half_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_svc' # predicts everything
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_svc' # 5 right ( 2 have another prediction) wrong. 3 sarscov
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_MUTUALSEL_10groupedKFOLD_cluster80_rf' # 3 right. 1 sarscov the rest nothing.
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_gboosting' # 3 correct
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_gnb' # 4 right 2 wrong 3 for sarscov
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_knn' # predicts everything
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_linear_svc' # 7 right 1wrong 4for sarscov
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_lr' # 5 right 1 wrong 4 for sarscov
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_rf' # 3 right 1 sarscov
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_sgd' # predicts 2 right. 1 wrong. 1 sarscov
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_svc' # nice to SEE
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NONE_10groupedKFOLD_cluster80_linear_svc_try3'
# model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_linear_svc'
# Model selected for this run; output() reads its prediction files and reports every hold-out protein.
model_name_specific = 'phys_ml/third_Window21_gap1_PHYSALL_NOFSEL_10groupedKFOLD_cluster80_linear_svc' # 7 right 1wrong 4for sarscov
output(model_name_specific)
################################ 105 Envelope glycoprotein PISLTVALMLGGITVGGMARN vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'I', 'L', 'L', 'L', 'I', 'L', 'L', 'F'] signal ['D', 'P', 'I', 'S', 'L', 'T', 'V', 'A', 'L', 'M'] vfp predicted: 1 index of true 1 ################################ 146 Genome polyprotein LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'I', 'G', 'L', 'V', 'I', 'T', 'F', 'F', 'D', 'L', 'V', 'V', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'L', 'A', 'Y', 'F', 'S', 'W', 'A', 'K', 'V', 'V', 'I', 'V', 'L', 'I', 'M', 'F', 'S', 'G', 'V', 'D'] signal [] vfp predicted: 0 index of true 74 ################################ 158 Genome polyprotein MVGAATLCSALYVGDLCGALFLVGQGFSWRHR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'V', 'G', 'A', 'A', 'T', 'L', 'C', 'S', 'A', 'L', 'Y', 'V', 'G', 'D', 'L', 'G', 'A', 'L', 'F', 'L', 'V', 'G', 'Q', 'G', 'F', 'A', 'W', 'D', 'M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'M', 'T', 'L', 'I', 'V', 'M', 'F', 'D', 'L', 'V', 'I', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'V', 'A', 'Y', 'Y', 'S', 'M', 'A', 'K', 'V', 'F', 'L', 'V', 'L', 'C', 'L', 'F', 'S', 'G', 'V', 'D'] signal [] vfp predicted: 0 index of true 73 ################################ 192 Spike glycoprotein S LAATSASLFPPWTAAAGVPFY vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'V', 'W', 'L', 'L', 'I', 'C', 'L', 'A', 'G', 'V', 'A', 'M', 'L', 'V', 'L', 'L', 'F', 'F', 'I'] signal [] vfp predicted: 0 index of true 204 ################################ 225 Envelope glycoprotein gp160 (Env polyprotein) AVGMGAVLFGFLGAAGSTMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I'] signal ['A', 'V', 'G', 'M', 'G', 'A', 'V', 'L', 'F', 'G'] vfp predicted: 1 index of true 0 ################################ 265 Envelope glycoprotein gp160 (Env polyprotein) AAGLGALFLGFLGDSREHMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'K', 'I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I', 'G', 'L', 'R', 'I', 'V', 'F', 'A', 'V', 'L'] signal [] vfp predicted: 1 index of true 0 ################################ 574 Pre-glycoprotein polyprotein GP complex GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'D', 'L', 'L', 'M', 'F', 'S', 'T', 'S'] signal [] vfp predicted: 1 index of true 0 ################################ 618 fusion glycoprotein F0 FLGLILGLGAAVTAGVALAKT vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'L', 'F', 'V', 'V', 'M', 'L', 'I', 'I', 'I', 'M', 'A', 'V', 'I', 'G', 'F', 'I', 'L', 'Y'] signal [] vfp predicted: 1 index of true 0 ################################ 779 G glycoprotein [FRWYGPKY CGYATVT] vfp model prediction
tmd prediction by DeepTMHMM
tmd ['F', 'F', 'F', 'I', 'I', 'G', 'L', 'I', 'I', 'G', 'L', 'F', 'L', 'V', 'L', 'R', 'V', 'G', 'I', 'Y', 'L'] signal ['M', 'K', 'C', 'L', 'L', 'Y', 'L', 'A', 'F', 'L', 'S', 'I', 'G', 'V', 'N', 'C'] vfp predicted: 0 index of true 0 ################################ 804 Spike glycoprotein SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'I', 'W', 'L', 'G', 'F', 'I', 'A', 'G', 'L', 'I', 'A', 'I', 'V', 'M', 'V', 'T', 'I', 'M', 'L'] signal [] vfp predicted: 0 index of true 130
# model_name_specific = 'one_hot_dl/all_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_bayesianFinal' # always predicted the first sequences
# model_name_specific = 'one_hot_dl/third_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_bayesianFinal' # predicts everything
# model_name_specific = 'one_hot_dl/half_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_bayesianFinal' # terrible. also predicts everything
# model_name_specific = 'one_hot_dl/half_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_bayesianFinalV2' # terrible. also predicts everything
# model_name_specific = 'one_hot_dl/half_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_bayesianFinalV2_cwei' # terrible
# model_name_specific = 'one_hot_dl/all_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_bayesianFinalV2_cwei' # terrible
# model_name_specific = 'one_hot_dl/half_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_att_bayesian_cwei'
# model_name_specific = 'one_hot_dl/half_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_cwei' # better, but still horrible
# model_name_specific = 'one_hot_dl/all_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_cwei' # better, but still horrible
# model_name_specific = 'one_hot_dl/all_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_cweiV2' # better, but terrible
# model_name_specific = 'one_hot_dl/all_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_cweiV3'
# model_name_specific = 'one_hot_dl/all_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_cweiV4'
# Model selected for this run; output() reads its prediction files and reports every hold-out protein.
model_name_specific = 'one_hot_dl/half_Window21_gap1_ENCODING_OHE_DL_10groupedKFOLD_cluster80_LSTM_cweiV3'
output(model_name_specific)
################################ 105 Envelope glycoprotein PISLTVALMLGGITVGGMARN vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'I', 'L', 'L', 'L', 'I', 'L', 'L', 'F']
signal ['D', 'P', 'I', 'S', 'L', 'T', 'V', 'A', 'L', 'M']
vfp predicted: 1.0
score vfp predicted: 0.6083
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 1
################################
146
Genome polyprotein
LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'I', 'G', 'L', 'V', 'I', 'T', 'F', 'F', 'D', 'L', 'V', 'V', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'L', 'A', 'Y', 'F', 'S', 'W', 'A', 'K', 'V', 'V', 'I', 'V', 'L', 'I', 'M', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 1.0
score vfp predicted: 0.6123
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 74
################################
158
Genome polyprotein
MVGAATLCSALYVGDLCGALFLVGQGFSWRHR
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'V', 'G', 'A', 'A', 'T', 'L', 'C', 'S', 'A', 'L', 'Y', 'V', 'G', 'D', 'L', 'G', 'A', 'L', 'F', 'L', 'V', 'G', 'Q', 'G', 'F', 'A', 'W', 'D', 'M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'M', 'T', 'L', 'I', 'V', 'M', 'F', 'D', 'L', 'V', 'I', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'V', 'A', 'Y', 'Y', 'S', 'M', 'A', 'K', 'V', 'F', 'L', 'V', 'L', 'C', 'L', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 1.0
score vfp predicted: 0.6263
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 73
################################
192
Spike glycoprotein S
LAATSASLFPPWTAAAGVPFY
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'V', 'W', 'L', 'L', 'I', 'C', 'L', 'A', 'G', 'V', 'A', 'M', 'L', 'V', 'L', 'L', 'F', 'F', 'I']
signal []
vfp predicted: 1.0
score vfp predicted: 0.5333
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 204
################################
225
Envelope glycoprotein gp160 (Env polyprotein)
AVGMGAVLFGFLGAAGSTMGA
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I']
signal ['A', 'V', 'G', 'M', 'G', 'A', 'V', 'L', 'F', 'G']
vfp predicted: 1.0
score vfp predicted: 0.6407
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 0
################################
265
Envelope glycoprotein gp160 (Env polyprotein)
AAGLGALFLGFLGDSREHMGA
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'K', 'I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I', 'G', 'L', 'R', 'I', 'V', 'F', 'A', 'V', 'L']
signal []
vfp predicted: 1.0
score vfp predicted: 0.6407
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 0
################################
574
Pre-glycoprotein polyprotein GP complex
GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'D', 'L', 'L', 'M', 'F', 'S', 'T', 'S']
signal []
vfp predicted: 1.0
score vfp predicted: 0.6407
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 0
################################
618
fusion glycoprotein F0
FLGLILGLGAAVTAGVALAKT
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'L', 'F', 'V', 'V', 'M', 'L', 'I', 'I', 'I', 'M', 'A', 'V', 'I', 'G', 'F', 'I', 'L', 'Y']
signal []
vfp predicted: 1.0
score vfp predicted: 0.6407
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 0
################################
779
G glycoprotein
[FRWYGPKY CGYATVT]
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['F', 'F', 'F', 'I', 'I', 'G', 'L', 'I', 'I', 'G', 'L', 'F', 'L', 'V', 'L', 'R', 'V', 'G', 'I', 'Y', 'L']
signal ['M', 'K', 'C', 'L', 'L', 'Y', 'L', 'A', 'F', 'L', 'S', 'I', 'G', 'V', 'N', 'C']
vfp predicted: 1.0
score vfp predicted: 0.6407
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 0
################################
804
Spike glycoprotein
SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'I', 'W', 'L', 'G', 'F', 'I', 'A', 'G', 'L', 'I', 'A', 'I', 'V', 'M', 'V', 'T', 'I', 'M', 'L']
signal []
vfp predicted: 1.0
score vfp predicted: 0.6244
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
91 91 1 0.3298 0.6702
118 118 1 0.3307 0.6693
104 104 1 0.3322 0.6678
113 113 1 0.3328 0.6672
101 101 1 0.3334 0.6666
index of true 130
# Method 3: in this method each sequence is represented by 100 dimensions.
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_svc' #
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_svc_m3' # 5 right. 1wrong . 1SARSCOV
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_rf_m3' # gets 2. doesnt predict anything else
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_gboosting' # gets 3 . 2 wrong. 1 sarscov
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_knn' # 4 correct 1 half way 2 wrong 1 sarscov
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_gnb' # predicts more than 1 subsequence. 7 right. but always with others
# model_name_specific = 'we_ml/half_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_rf' # 4 correct3 3 corrected with more predictions. 2 Sarscov
# model_name_specific = 'we_ml/half_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_svc' # predicts more than 1 sequence. 3 right. 2 wrong. some predicts too much
# model_name_specific = 'we_ml/half_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_knn' # a lot of stuff
# model_name_specific = 'we_ml/half_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_gnb' # Predicts more than one sequence
# model_name_specific = 'we_ml/third_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_rf' # gets 5. 1 wrong 2 for sarscov
# model_name_specific = 'we_ml/third_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_svc' # gets 3 and 2 with corrected but more predictions. 3 sarscov
# model_name_specific = 'we_ml/third_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_gnb' # gets2. plus 5 where predicts other things. 2 for sarscov
# model_name_specific = 'we_we_ml/third_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_knn' # predicts everything
##### method 1 each sequence is 19*100
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_svc_m1' # predicts 4. nothing more.
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_rf' # 2 correct 1 wrong. Does not predict anything else
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_gnb' # 3 correct. 3 correct with extra seq. 1 sarscov
# model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_knn' # predicts everything
# model_name_specific = 'we_ml/half_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_rf' # 6 right predicts everything with 0.4. But it gets right a lot
# model_name_specific = 'we_ml/half_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_svc' # predicts everything
# model_name_specific = 'we_ml/half_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_gnb' # predicts very big subsequences
# model_name_specific = 'we_ml/third_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_rf' # predicts a lot but not above 0.5. 4 right 1 wrong. 1 sarscov
# model_name_specific = 'we_ml/third_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_svc' # predicts a lot of subsequences
# model_name_specific = 'we_ml/third_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_gnb' # predicts a lot but not terrible
# Model selected for this run; output() reads its prediction files and reports every hold-out protein.
model_name_specific = 'we_ml/all_Window21_gap1_WEPROTVEC_10groupedKFOLD_cluster80_svc_m3' # 5 right. 1wrong . 1SARSCOV
output(model_name_specific)
################################ 105 Envelope glycoprotein PISLTVALMLGGITVGGMARN vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'I', 'L', 'L', 'L', 'I', 'L', 'L', 'F'] signal ['D', 'P', 'I', 'S', 'L', 'T', 'V', 'A', 'L', 'M'] vfp predicted: 0 index of true 1 ################################ 146 Genome polyprotein LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'I', 'G', 'L', 'V', 'I', 'T', 'F', 'F', 'D', 'L', 'V', 'V', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'L', 'A', 'Y', 'F', 'S', 'W', 'A', 'K', 'V', 'V', 'I', 'V', 'L', 'I', 'M', 'F', 'S', 'G', 'V', 'D'] signal [] vfp predicted: 0 index of true 74 ################################ 158 Genome polyprotein MVGAATLCSALYVGDLCGALFLVGQGFSWRHR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'V', 'G', 'A', 'A', 'T', 'L', 'C', 'S', 'A', 'L', 'Y', 'V', 'G', 'D', 'L', 'G', 'A', 'L', 'F', 'L', 'V', 'G', 'Q', 'G', 'F', 'A', 'W', 'D', 'M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'M', 'T', 'L', 'I', 'V', 'M', 'F', 'D', 'L', 'V', 'I', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'V', 'A', 'Y', 'Y', 'S', 'M', 'A', 'K', 'V', 'F', 'L', 'V', 'L', 'C', 'L', 'F', 'S', 'G', 'V', 'D'] signal [] vfp predicted: 1 index of true 73 ################################ 192 Spike glycoprotein S LAATSASLFPPWTAAAGVPFY vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'V', 'W', 'L', 'L', 'I', 'C', 'L', 'A', 'G', 'V', 'A', 'M', 'L', 'V', 'L', 'L', 'F', 'F', 'I'] signal [] vfp predicted: 0 index of true 204 ################################ 225 Envelope glycoprotein gp160 (Env polyprotein) AVGMGAVLFGFLGAAGSTMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I'] signal ['A', 'V', 'G', 'M', 'G', 'A', 'V', 'L', 'F', 'G'] vfp predicted: 1 index of true 0 ################################ 265 Envelope glycoprotein gp160 (Env polyprotein) AAGLGALFLGFLGDSREHMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'K', 'I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I', 'G', 'L', 'R', 'I', 'V', 'F', 'A', 'V', 'L'] signal [] vfp predicted: 1 index of true 0 ################################ 574 Pre-glycoprotein polyprotein GP complex GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'D', 'L', 'L', 'M', 'F', 'S', 'T', 'S'] signal [] vfp predicted: 1 index of true 0 ################################ 618 fusion glycoprotein F0 FLGLILGLGAAVTAGVALAKT vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'L', 'F', 'V', 'V', 'M', 'L', 'I', 'I', 'I', 'M', 'A', 'V', 'I', 'G', 'F', 'I', 'L', 'Y'] signal [] vfp predicted: 1 index of true 0 ################################ 779 G glycoprotein [FRWYGPKY CGYATVT] vfp model prediction
tmd prediction by DeepTMHMM
tmd ['F', 'F', 'F', 'I', 'I', 'G', 'L', 'I', 'I', 'G', 'L', 'F', 'L', 'V', 'L', 'R', 'V', 'G', 'I', 'Y', 'L'] signal ['M', 'K', 'C', 'L', 'L', 'Y', 'L', 'A', 'F', 'L', 'S', 'I', 'G', 'V', 'N', 'C'] vfp predicted: 0 index of true 0 ################################ 804 Spike glycoprotein SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'I', 'W', 'L', 'G', 'F', 'I', 'A', 'G', 'L', 'I', 'A', 'I', 'V', 'M', 'V', 'T', 'I', 'M', 'L'] signal [] vfp predicted: 0 index of true 130
# Candidates lstm1-lstm6 under 'we_dl/' (WEPROTVEC, method1). Exactly one line is left
# uncommented to choose the model passed to `output`; the trailing notes are the author's
# observations on the hold-out proteins.
# model_name_specific = 'we_dl/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_lstm1' # gets 4 correct + 2. predicts more sequences
model_name_specific = 'we_dl/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_lstm2' # 5 correct. + 1 with other seq. 1 wrong. good
# model_name_specific = 'we_dl/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_lstm3' # not bad. worse than lstm2 and lstm1
# model_name_specific = 'we_dl/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_lstm4' # 4 correct. 2 correct but other sequence predicted.
# model_name_specific = 'we_dl/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_lstm5' # really good; check
# model_name_specific = 'we_dl/all_Window21_gap1_WEPROTVEC_method1_10groupedKFOLD_cluster80_lstm6' # it's worse, but not bad
# `output` is defined outside this section; the text after this call appears to be its printed report.
output(model_name_specific)
################################ 105 Envelope glycoprotein PISLTVALMLGGITVGGMARN vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'I', 'L', 'L', 'L', 'I', 'L', 'L', 'F'] signal ['D', 'P', 'I', 'S', 'L', 'T', 'V', 'A', 'L', 'M'] vfp predicted: 1.0 score vfp predicted: 0.695 max score predicted top scores Unnamed: 0 class predicted prob_class_0 prob_class_1 1 1 1 0.3050 0.6950 0 0 1 0.4084 0.5916 2 2 0 0.8265 0.1735 3 3 0 0.9470 0.0530 4 4 0 0.9763 0.0237 index of true 1 ################################ 146 Genome polyprotein LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'I', 'G', 'L', 'V', 'I', 'T', 'F', 'F', 'D', 'L', 'V', 'V', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'L', 'A', 'Y', 'F', 'S', 'W', 'A', 'K', 'V', 'V', 'I', 'V', 'L', 'I', 'M', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 0.0
score vfp predicted: 0.0115
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
18 18 0 0.9642 0.0358
17 17 0 0.9680 0.0320
150 150 0 0.9695 0.0305
19 19 0 0.9721 0.0279
149 149 0 0.9743 0.0257
index of true 74
################################
158
Genome polyprotein
MVGAATLCSALYVGDLCGALFLVGQGFSWRHR
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'V', 'G', 'A', 'A', 'T', 'L', 'C', 'S', 'A', 'L', 'Y', 'V', 'G', 'D', 'L', 'G', 'A', 'L', 'F', 'L', 'V', 'G', 'Q', 'G', 'F', 'A', 'W', 'D', 'M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'M', 'T', 'L', 'I', 'V', 'M', 'F', 'D', 'L', 'V', 'I', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'V', 'A', 'Y', 'Y', 'S', 'M', 'A', 'K', 'V', 'F', 'L', 'V', 'L', 'C', 'L', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 1.0
score vfp predicted: 1.0
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
70 70 1 0.0 1.0
71 71 1 0.0 1.0
72 72 1 0.0 1.0
73 73 1 0.0 1.0
74 74 1 0.0 1.0
index of true 73
################################
192
Spike glycoprotein S
LAATSASLFPPWTAAAGVPFY
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'V', 'W', 'L', 'L', 'I', 'C', 'L', 'A', 'G', 'V', 'A', 'M', 'L', 'V', 'L', 'L', 'F', 'F', 'I']
signal []
vfp predicted: 0.0
score vfp predicted: 0.0415
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
142 142 1 0.0102 0.9898
143 143 1 0.0114 0.9886
144 144 1 0.0321 0.9679
141 141 1 0.0417 0.9583
145 145 1 0.0501 0.9499
index of true 204
################################
225
Envelope glycoprotein gp160 (Env polyprotein)
AVGMGAVLFGFLGAAGSTMGA
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I'] signal ['A', 'V', 'G', 'M', 'G', 'A', 'V', 'L', 'F', 'G'] vfp predicted: 1.0 score vfp predicted: 1.0 max score predicted top scores Unnamed: 0 class predicted prob_class_0 prob_class_1 0 0 1 0.0 1.0 1 1 1 0.0 1.0 2 2 1 0.0 1.0 3 3 1 0.0 1.0 4 4 1 0.0 1.0 index of true 0 ################################ 265 Envelope glycoprotein gp160 (Env polyprotein) AAGLGALFLGFLGDSREHMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'K', 'I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I', 'G', 'L', 'R', 'I', 'V', 'F', 'A', 'V', 'L']
signal []
vfp predicted: 0.0
score vfp predicted: 0.0904
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
3 3 1 0.3470 0.6530
4 4 1 0.3936 0.6064
2 2 1 0.4733 0.5267
1 1 0 0.6264 0.3736
13 13 0 0.8008 0.1992
index of true 0
################################
574
Pre-glycoprotein polyprotein GP complex
GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'D', 'L', 'L', 'M', 'F', 'S', 'T', 'S'] signal [] vfp predicted: 1.0 score vfp predicted: 1.0 max score predicted top scores Unnamed: 0 class predicted prob_class_0 prob_class_1 0 0 1 0.0000 1.0000 1 1 1 0.0000 1.0000 2 2 1 0.0001 0.9999 3 3 1 0.0213 0.9787 4 4 0 0.6285 0.3715 index of true 0 ################################ 618 fusion glycoprotein F0 FLGLILGLGAAVTAGVALAKT vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'L', 'F', 'V', 'V', 'M', 'L', 'I', 'I', 'I', 'M', 'A', 'V', 'I', 'G', 'F', 'I', 'L', 'Y']
signal []
vfp predicted: 1.0
score vfp predicted: 1.0
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
0 0 1 0.0000 1.0000
1 1 1 0.0018 0.9982
2 2 1 0.0256 0.9744
113 113 1 0.0421 0.9579
3 3 1 0.1423 0.8577
index of true 0
################################
779
G glycoprotein
[FRWYGPKY CGYATVT]
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['F', 'F', 'F', 'I', 'I', 'G', 'L', 'I', 'I', 'G', 'L', 'F', 'L', 'V', 'L', 'R', 'V', 'G', 'I', 'Y', 'L']
signal ['M', 'K', 'C', 'L', 'L', 'Y', 'L', 'A', 'F', 'L', 'S', 'I', 'G', 'V', 'N', 'C']
vfp predicted: 0.0
score vfp predicted: 0.0009
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
112 112 0 0.9217 0.0783
113 113 0 0.9371 0.0629
111 111 0 0.9475 0.0525
291 291 0 0.9611 0.0389
114 114 0 0.9639 0.0361
index of true 0
################################
804
Spike glycoprotein
SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'I', 'W', 'L', 'G', 'F', 'I', 'A', 'G', 'L', 'I', 'A', 'I', 'V', 'M', 'V', 'T', 'I', 'M', 'L']
signal []
vfp predicted: 0.0
score vfp predicted: 0.0435
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
105 105 1 0.0001 0.9999
102 102 1 0.0001 0.9999
104 104 1 0.0001 0.9999
103 103 1 0.0001 0.9999
106 106 1 0.0004 0.9996
index of true 130
# get the representations from the pretrained esm2_t33_650M_UR50D and feed them to a model
# (the string was not changed, so the names in this first group still say esm2_t6_8M_UR50D)
# model_name_specific = 'esm_ml/all_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_svc' # predicts 3
# model_name_specific = 'esm_ml/all_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_rf' # does not predict anything. just 2 (correct)
# model_name_specific = 'esm_ml/all_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_gnb' # bad
# model_name_specific = 'esm_ml/third_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_svc' # predicts a lot of things
# model_name_specific = 'esm_ml/third_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_rf' #
# model_name_specific = 'esm_ml/third_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_gnb'
# model_name_specific ='esm_ml/half_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_rf' # predicts a lot
# model_name_specific = 'esm_ml/half_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_gnb' # always predicts 2 seqs.
# # smaller model
# model_name_specific = 'esm_ml/all_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_svc' # predicts 5 correct. for the others it does not predict anything
# model_name_specific = 'esm_ml/all_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_rf' # does not predict anything
# model_name_specific = 'esm_ml/all_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_gnb' # predicts a lot of things
# model_name_specific = 'esm_ml/third_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_svc' # predicts a lot of things
# model_name_specific = 'esm_ml/third_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_rf' # bad
# model_name_specific = 'esm_ml/third_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_gnb' # predicts in excess
# model_name_specific = 'esm_ml/half_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_svc' # predicts too much
# model_name_specific = 'esm_ml/half_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_rf' # predicts a lot
# model_name_specific = 'esm_ml/half_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_gnb' # not horrible
# Active selection (same path as the commented-out 'all ... T68M ... svc' entry above).
model_name_specific = 'esm_ml/all_Window21_gap1_TRANSFORMER_T68M_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_svc' # predicts 5 correct. for the others it does not predict anything
# `output` is defined outside this section; the text after this call appears to be its printed report.
output(model_name_specific)
################################ 105 Envelope glycoprotein PISLTVALMLGGITVGGMARN vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'I', 'L', 'L', 'L', 'I', 'L', 'L', 'F'] signal ['D', 'P', 'I', 'S', 'L', 'T', 'V', 'A', 'L', 'M'] vfp predicted: 1 index of true 1 ################################ 146 Genome polyprotein LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'I', 'G', 'L', 'V', 'I', 'T', 'F', 'F', 'D', 'L', 'V', 'V', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'L', 'A', 'Y', 'F', 'S', 'W', 'A', 'K', 'V', 'V', 'I', 'V', 'L', 'I', 'M', 'F', 'S', 'G', 'V', 'D'] signal [] vfp predicted: 0 index of true 74 ################################ 158 Genome polyprotein MVGAATLCSALYVGDLCGALFLVGQGFSWRHR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'V', 'G', 'A', 'A', 'T', 'L', 'C', 'S', 'A', 'L', 'Y', 'V', 'G', 'D', 'L', 'G', 'A', 'L', 'F', 'L', 'V', 'G', 'Q', 'G', 'F', 'A', 'W', 'D', 'M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'M', 'T', 'L', 'I', 'V', 'M', 'F', 'D', 'L', 'V', 'I', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'V', 'A', 'Y', 'Y', 'S', 'M', 'A', 'K', 'V', 'F', 'L', 'V', 'L', 'C', 'L', 'F', 'S', 'G', 'V', 'D'] signal [] vfp predicted: 0 index of true 73 ################################ 192 Spike glycoprotein S LAATSASLFPPWTAAAGVPFY vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'V', 'W', 'L', 'L', 'I', 'C', 'L', 'A', 'G', 'V', 'A', 'M', 'L', 'V', 'L', 'L', 'F', 'F', 'I'] signal [] vfp predicted: 0 index of true 204 ################################ 225 Envelope glycoprotein gp160 (Env polyprotein) AVGMGAVLFGFLGAAGSTMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I'] signal ['A', 'V', 'G', 'M', 'G', 'A', 'V', 'L', 'F', 'G'] vfp predicted: 1 index of true 0 ################################ 265 Envelope glycoprotein gp160 (Env polyprotein) AAGLGALFLGFLGDSREHMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'K', 'I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I', 'G', 'L', 'R', 'I', 'V', 'F', 'A', 'V', 'L'] signal [] vfp predicted: 1 index of true 0 ################################ 574 Pre-glycoprotein polyprotein GP complex GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'D', 'L', 'L', 'M', 'F', 'S', 'T', 'S'] signal [] vfp predicted: 0 index of true 0 ################################ 618 fusion glycoprotein F0 FLGLILGLGAAVTAGVALAKT vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'L', 'F', 'V', 'V', 'M', 'L', 'I', 'I', 'I', 'M', 'A', 'V', 'I', 'G', 'F', 'I', 'L', 'Y'] signal [] vfp predicted: 1 index of true 0 ################################ 779 G glycoprotein [FRWYGPKY CGYATVT] vfp model prediction
tmd prediction by DeepTMHMM
tmd ['F', 'F', 'F', 'I', 'I', 'G', 'L', 'I', 'I', 'G', 'L', 'F', 'L', 'V', 'L', 'R', 'V', 'G', 'I', 'Y', 'L'] signal ['M', 'K', 'C', 'L', 'L', 'Y', 'L', 'A', 'F', 'L', 'S', 'I', 'G', 'V', 'N', 'C'] vfp predicted: 0 index of true 0 ################################ 804 Spike glycoprotein SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'I', 'W', 'L', 'G', 'F', 'I', 'A', 'G', 'L', 'I', 'A', 'I', 'V', 'M', 'V', 'T', 'I', 'M', 'L'] signal [] vfp predicted: 0 index of true 130
# transformer small ES2bT68M + LSTM
# seq representations
# model_name_specific = 'esm2b_dl/all_Window21_gap1_TRANSFORMER_facebook_T68M_esm2_t6_8M_UR50D_token_class_10groupedKFOLD_cluster80_bilstm1' # predicts a lot / everything
# model_name_specific = 'esm2b_dl/third_Window21_gap1_TRANSFORMER_facebook_T68M_esm2_t6_8M_UR50D_token_class_10groupedKFOLD_cluster80_bilstm1' # better but predicts a lot
# model_name_specific = 'esm2b_dl/half_Window21_gap1_TRANSFORMER_facebook_T68M_esm2_t6_8M_UR50D_token_class_10groupedKFOLD_cluster80_bilstm1' # better but predicts a lot
# model_name_specific = 'esm2b_dl/half_Window21_gap1_TRANSFORMER_facebook_esm2_t6_8M_UR50D_class_class_10groupedKFOLD_cluster80_svc' # bad. predicts a lot of things.
# # contact representations
# model_name_specific = 'esm_dl/all_Window21_gap1_TRANSFORMER_facebook_T68M_esm2_t6_8M_UR50D_contact_class_10groupedKFOLD_cluster80_bilstm1' # predicts everything. did not run CV scores
# model_name_specific = 'esm2b_dl/half_Window21_gap1_TRANSFORMER_facebook_T68M_esm2_t6_8M_UR50D_contact_class_10groupedKFOLD_cluster80_bilstm1' # predicts entire seq
# model_name_specific = 'esm2b_dl/third_Window21_gap1_TRANSFORMER_facebook_T68M_esm2_t6_8M_UR50D_contact_class_10groupedKFOLD_cluster80_bilstm1' # predicts entire seq. did not run CV scores
# bigger transformer ES2bT33 + LSTM
# model_name_specific = 'esm2b_dl/all_Window21_gap1_TRANSFORMER_T33_facebook_esm2_t6_8M_UR50D_token_class_10groupedKFOLD_cluster80_bilstm1' # very bad. now it is empty, so accidentally deleted
# Active selection: the single uncommented candidate in this cell.
model_name_specific = 'esm2b_dl/third_Window21_gap1_TRANSFORMER_T33_facebook_esm2_t6_8M_UR50D_token_class_10groupedKFOLD_cluster80_bilstm1' # predicts a lot of things
# model_name_specific = 'esm2b_dl/half_Window21_gap1_TRANSFORMER_T33_facebook_esm2_t6_8M_UR50D_token_class_10groupedKFOLD_cluster80_bilstm1' # predicts a lot of things
# didn't pursue further
# `output` is defined outside this section; the text after this call appears to be its printed report.
output(model_name_specific)
################################ 105 Envelope glycoprotein PISLTVALMLGGITVGGMARN vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'I', 'L', 'L', 'L', 'I', 'L', 'L', 'F']
signal ['D', 'P', 'I', 'S', 'L', 'T', 'V', 'A', 'L', 'M']
vfp predicted: 1.0
score vfp predicted: 0.7169
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
76 76 1 0.0043 0.9957
81 81 1 0.0109 0.9891
75 75 1 0.0123 0.9877
78 78 1 0.0146 0.9854
49 49 1 0.0156 0.9844
index of true 1
################################
146
Genome polyprotein
LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'I', 'G', 'L', 'V', 'I', 'T', 'F', 'F', 'D', 'L', 'V', 'V', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'L', 'A', 'Y', 'F', 'S', 'W', 'A', 'K', 'V', 'V', 'I', 'V', 'L', 'I', 'M', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 0.0
score vfp predicted: 0.3481
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
149 149 1 0.0060 0.9940
150 150 1 0.0062 0.9938
81 81 1 0.0145 0.9855
147 147 1 0.0204 0.9796
151 151 1 0.0477 0.9523
index of true 74
################################
158
Genome polyprotein
MVGAATLCSALYVGDLCGALFLVGQGFSWRHR
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'V', 'G', 'A', 'A', 'T', 'L', 'C', 'S', 'A', 'L', 'Y', 'V', 'G', 'D', 'L', 'G', 'A', 'L', 'F', 'L', 'V', 'G', 'Q', 'G', 'F', 'A', 'W', 'D', 'M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'M', 'T', 'L', 'I', 'V', 'M', 'F', 'D', 'L', 'V', 'I', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'V', 'A', 'Y', 'Y', 'S', 'M', 'A', 'K', 'V', 'F', 'L', 'V', 'L', 'C', 'L', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 1.0
score vfp predicted: 0.9998
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
71 71 1 0.0001 0.9999
70 70 1 0.0001 0.9999
74 74 1 0.0001 0.9999
76 76 1 0.0001 0.9999
77 77 1 0.0001 0.9999
index of true 73
################################
192
Spike glycoprotein S
LAATSASLFPPWTAAAGVPFY
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'V', 'W', 'L', 'L', 'I', 'C', 'L', 'A', 'G', 'V', 'A', 'M', 'L', 'V', 'L', 'L', 'F', 'F', 'I']
signal []
vfp predicted: 0.0
score vfp predicted: 0.1702
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
565 565 1 0.0000 1.0000
567 567 1 0.0001 0.9999
566 566 1 0.0002 0.9998
60 60 1 0.0002 0.9998
556 556 1 0.0002 0.9998
index of true 204
################################
225
Envelope glycoprotein gp160 (Env polyprotein)
AVGMGAVLFGFLGAAGSTMGA
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I'] signal ['A', 'V', 'G', 'M', 'G', 'A', 'V', 'L', 'F', 'G'] vfp predicted: 1.0 score vfp predicted: 0.9998 max score predicted top scores Unnamed: 0 class predicted prob_class_0 prob_class_1 3 3 1 0.0001 0.9999 4 4 1 0.0001 0.9999 1 1 1 0.0001 0.9999 0 0 1 0.0002 0.9998 2 2 1 0.0002 0.9998 index of true 0 ################################ 265 Envelope glycoprotein gp160 (Env polyprotein) AAGLGALFLGFLGDSREHMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'K', 'I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I', 'G', 'L', 'R', 'I', 'V', 'F', 'A', 'V', 'L']
signal []
vfp predicted: 0.0
score vfp predicted: 0.1791
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
20 20 1 0.0003 0.9997
31 31 1 0.0083 0.9917
21 21 1 0.0136 0.9864
22 22 1 0.0675 0.9325
19 19 1 0.0705 0.9295
index of true 0
################################
574
Pre-glycoprotein polyprotein GP complex
GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'D', 'L', 'L', 'M', 'F', 'S', 'T', 'S']
signal []
vfp predicted: 1.0
score vfp predicted: 0.9992
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
0 0 1 0.0008 0.9992
1 1 1 0.0010 0.9990
2 2 1 0.0917 0.9083
61 61 1 0.1069 0.8931
60 60 1 0.1371 0.8629
index of true 0
################################
618
fusion glycoprotein F0
FLGLILGLGAAVTAGVALAKT
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'L', 'F', 'V', 'V', 'M', 'L', 'I', 'I', 'I', 'M', 'A', 'V', 'I', 'G', 'F', 'I', 'L', 'Y']
signal []
vfp predicted: 1.0
score vfp predicted: 0.9912
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
211 211 1 0.0017 0.9983
192 192 1 0.0018 0.9982
210 210 1 0.0023 0.9977
212 212 1 0.0042 0.9958
0 0 1 0.0088 0.9912
index of true 0
################################
779
G glycoprotein
[FRWYGPKY CGYATVT]
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['F', 'F', 'F', 'I', 'I', 'G', 'L', 'I', 'I', 'G', 'L', 'F', 'L', 'V', 'L', 'R', 'V', 'G', 'I', 'Y', 'L']
signal ['M', 'K', 'C', 'L', 'L', 'Y', 'L', 'A', 'F', 'L', 'S', 'I', 'G', 'V', 'N', 'C']
vfp predicted: 0.0
score vfp predicted: 0.007
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
193 193 1 0.1084 0.8916
337 337 1 0.2248 0.7752
448 448 1 0.2288 0.7712
426 426 1 0.2829 0.7171
199 199 1 0.3200 0.6800
index of true 0
################################
804
Spike glycoprotein
SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'I', 'W', 'L', 'G', 'F', 'I', 'A', 'G', 'L', 'I', 'A', 'I', 'V', 'M', 'V', 'T', 'I', 'M', 'L']
signal []
vfp predicted: 1.0
score vfp predicted: 0.5252
max score predicted
top scores
Unnamed: 0 class predicted prob_class_0 prob_class_1
241 241 1 0.0002 0.9998
242 242 1 0.0005 0.9995
240 240 1 0.0006 0.9994
320 320 1 0.0007 0.9993
188 188 1 0.0014 0.9986
index of true 130
# transformers: fine-tune and predict (no model or hyperparameters changed)
# the smallest, T6 8M
# model_name_specific = 'esm2b_dl/all_Window21_gap1_TRANSFORMER_esm2_t6_8M_UR50D_finetune_class_10groupedKFOLD_cluster80' # predicts 2 correct. does not predict anything else
# Active selection: the single uncommented candidate in this cell.
model_name_specific = 'esm2b_dl/third_Window21_gap1_TRANSFORMER_esm2_t6_8M_UR50D_finetune_class_10groupedKFOLD_cluster80' # better. predicts 4 correct. 1 TMD
# model_name_specific = 'esm2b_dl/half_Window21_gap1_TRANSFORMER_esm2_t6_8M_UR50D_finetune_class_10groupedKFOLD_cluster80' # it gets them correct, but predicts a lot of sequences
# model_name_specific = 'esm2b_dl/all_Window21_gap1_TRANSFORMER_facebook/esm2_t12_35M_UR50D_finetune_class_10groupedKFOLD_cluster80'# 4 correct. then predicts anything. 1 sarscov
# model_name_specific = 'esm2b_dl/third_Window21_gap1_TRANSFORMER_facebook/third_esm2_t12_35M_UR50D_finetune_class_10groupedKFOLD_cluster80' # 4 correct. 1 wrong (TMD). 2 sarscov
# model_name_specific = 'esm2b_dl/half_Window21_gap1_TRANSFORMER_facebook/esm2_t12_35M_UR50D_finetune_class_10groupedKFOLD_cluster80' # predicts everything / too much
# `output` is defined outside this section; the text after this call appears to be its printed report.
output(model_name_specific)
################################ 105 Envelope glycoprotein PISLTVALMLGGITVGGMARN vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'I', 'L', 'L', 'L', 'I', 'L', 'L', 'F'] signal ['D', 'P', 'I', 'S', 'L', 'T', 'V', 'A', 'L', 'M'] vfp predicted: 1 score vfp predicted: 0.5110286474227905 max score predicted top scores Unnamed: 0 label score class predicted prob_class_0 prob_class_1 5 5 LABEL_1 0.663415 1 0.336585 0.663415 7 7 LABEL_1 0.652628 1 0.347372 0.652628 4 4 LABEL_1 0.608829 1 0.391171 0.608829 3 3 LABEL_1 0.578003 1 0.421997 0.578003 6 6 LABEL_1 0.544576 1 0.455424 0.544576 index of true 1 ################################ 146 Genome polyprotein LVAPPTLCSALYVEDAFGAVSLVGQAFTFRPR vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'I', 'G', 'L', 'V', 'I', 'T', 'F', 'F', 'D', 'L', 'V', 'V', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'L', 'A', 'Y', 'F', 'S', 'W', 'A', 'K', 'V', 'V', 'I', 'V', 'L', 'I', 'M', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 0
score vfp predicted: 0.0710342526435852
max score predicted
top scores
Unnamed: 0 label score class predicted prob_class_0 \
159 159 LABEL_0 0.505556 0 0.505556
156 156 LABEL_0 0.553455 0 0.553455
158 158 LABEL_0 0.597904 0 0.597904
89 89 LABEL_0 0.600529 0 0.600529
155 155 LABEL_0 0.684759 0 0.684759
prob_class_1
159 0.494444
156 0.446545
158 0.402096
89 0.399471
155 0.315241
index of true 74
################################
158
Genome polyprotein
MVGAATLCSALYVGDLCGALFLVGQGFSWRHR
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['M', 'M', 'V', 'G', 'A', 'A', 'T', 'L', 'C', 'S', 'A', 'L', 'Y', 'V', 'G', 'D', 'L', 'G', 'A', 'L', 'F', 'L', 'V', 'G', 'Q', 'G', 'F', 'A', 'W', 'D', 'M', 'M', 'M', 'N', 'W', 'S', 'P', 'A', 'M', 'T', 'L', 'I', 'V', 'M', 'F', 'D', 'L', 'V', 'I', 'G', 'A', 'H', 'W', 'G', 'V', 'M', 'A', 'G', 'V', 'A', 'Y', 'Y', 'S', 'M', 'A', 'K', 'V', 'F', 'L', 'V', 'L', 'C', 'L', 'F', 'S', 'G', 'V', 'D']
signal []
vfp predicted: 0
score vfp predicted: 0.1068747639656066
max score predicted
top scores
Unnamed: 0 label score class predicted prob_class_0 \
71 71 LABEL_1 0.712424 1 0.287576
158 158 LABEL_0 0.677653 0 0.677653
155 155 LABEL_0 0.738579 0 0.738579
78 78 LABEL_0 0.767368 0 0.767368
70 70 LABEL_0 0.810682 0 0.810682
prob_class_1
71 0.712424
158 0.322347
155 0.261421
78 0.232632
70 0.189318
index of true 73
################################
192
Spike glycoprotein S
LAATSASLFPPWTAAAGVPFY
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'V', 'W', 'L', 'L', 'I', 'C', 'L', 'A', 'G', 'V', 'A', 'M', 'L', 'V', 'L', 'L', 'F', 'F', 'I']
signal []
vfp predicted: 0
score vfp predicted: 0.0810089707374572
max score predicted
top scores
Unnamed: 0 label score class predicted prob_class_0 \
561 561 LABEL_1 0.853126 1 0.146874
563 563 LABEL_1 0.841447 1 0.158553
562 562 LABEL_1 0.840442 1 0.159558
559 559 LABEL_1 0.840439 1 0.159561
560 560 LABEL_1 0.838301 1 0.161699
prob_class_1
561 0.853126
563 0.841447
562 0.840442
559 0.840439
560 0.838301
index of true 204
################################
225
Envelope glycoprotein gp160 (Env polyprotein)
AVGMGAVLFGFLGAAGSTMGA
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I'] signal ['A', 'V', 'G', 'M', 'G', 'A', 'V', 'L', 'F', 'G'] vfp predicted: 1 score vfp predicted: 0.9138033390045166 max score predicted top scores Unnamed: 0 label score class predicted prob_class_0 prob_class_1 0 0 LABEL_1 0.913803 1 0.086197 0.913803 1 1 LABEL_1 0.913337 1 0.086663 0.913337 4 4 LABEL_1 0.911635 1 0.088365 0.911635 2 2 LABEL_1 0.911470 1 0.088530 0.911470 7 7 LABEL_1 0.906559 1 0.093441 0.906559 index of true 0 ################################ 265 Envelope glycoprotein gp160 (Env polyprotein) AAGLGALFLGFLGDSREHMGA vfp model prediction
tmd prediction by DeepTMHMM
tmd ['I', 'K', 'I', 'F', 'I', 'M', 'I', 'V', 'G', 'G', 'L', 'I', 'G', 'L', 'R', 'I', 'V', 'F', 'A', 'V', 'L'] signal [] vfp predicted: 1 score vfp predicted: 0.8537814021110535 max score predicted top scores Unnamed: 0 label score class predicted prob_class_0 prob_class_1 1 1 LABEL_1 0.862039 1 0.137961 0.862039 2 2 LABEL_1 0.857080 1 0.142920 0.857080 0 0 LABEL_1 0.853781 1 0.146219 0.853781 8 8 LABEL_0 0.828593 0 0.828593 0.171407 9 9 LABEL_0 0.832909 0 0.832909 0.167091 index of true 0 ################################ 574 Pre-glycoprotein polyprotein GP complex GTFTWTLSDSSGVENPGGYCLTKWMILAAELKCFGNTAV vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'M', 'D', 'L', 'L', 'M', 'F', 'S', 'T', 'S']
signal []
vfp predicted: 0
score vfp predicted: 0.2996881604194641
max score predicted
top scores
Unnamed: 0 label score class predicted prob_class_0 \
32 32 LABEL_0 0.665663 0 0.665663
0 0 LABEL_0 0.700312 0 0.700312
1 1 LABEL_0 0.751045 0 0.751045
2 2 LABEL_0 0.821011 0 0.821011
200 200 LABEL_0 0.832837 0 0.832837
prob_class_1
32 0.334337
0 0.299688
1 0.248955
2 0.178989
200 0.167163
index of true 0
################################
618
fusion glycoprotein F0
FLGLILGLGAAVTAGVALAKT
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['L', 'L', 'F', 'V', 'V', 'M', 'L', 'I', 'I', 'I', 'M', 'A', 'V', 'I', 'G', 'F', 'I', 'L', 'Y'] signal [] vfp predicted: 1 score vfp predicted: 0.8360065817832947 max score predicted top scores Unnamed: 0 label score class predicted prob_class_0 prob_class_1 2 2 LABEL_1 0.869171 1 0.130829 0.869171 1 1 LABEL_1 0.857347 1 0.142653 0.857347 0 0 LABEL_1 0.836007 1 0.163993 0.836007 6 6 LABEL_1 0.806049 1 0.193951 0.806049 5 5 LABEL_1 0.805186 1 0.194814 0.805186 index of true 0 ################################ 779 G glycoprotein [FRWYGPKY CGYATVT] vfp model prediction
tmd prediction by DeepTMHMM
tmd ['F', 'F', 'F', 'I', 'I', 'G', 'L', 'I', 'I', 'G', 'L', 'F', 'L', 'V', 'L', 'R', 'V', 'G', 'I', 'Y', 'L']
signal ['M', 'K', 'C', 'L', 'L', 'Y', 'L', 'A', 'F', 'L', 'S', 'I', 'G', 'V', 'N', 'C']
vfp predicted: 0
score vfp predicted: 0.0702841281890869
max score predicted
top scores
Unnamed: 0 label score class predicted prob_class_0 \
117 117 LABEL_0 0.803253 0 0.803253
118 118 LABEL_0 0.861441 0 0.861441
116 116 LABEL_0 0.876460 0 0.876460
372 372 LABEL_0 0.877100 0 0.877100
70 70 LABEL_0 0.878088 0 0.878088
prob_class_1
117 0.196747
118 0.138559
116 0.123540
372 0.122900
70 0.121912
index of true 0
################################
804
Spike glycoprotein
SFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT
vfp model prediction
tmd prediction by DeepTMHMM
tmd ['W', 'Y', 'I', 'W', 'L', 'G', 'F', 'I', 'A', 'G', 'L', 'I', 'A', 'I', 'V', 'M', 'V', 'T', 'I', 'M', 'L']
signal []
vfp predicted: 0
score vfp predicted: 0.0682223439216613
max score predicted
top scores
Unnamed: 0 label score class predicted prob_class_0 \
191 191 LABEL_1 0.857407 1 0.142593
189 189 LABEL_1 0.855446 1 0.144554
202 202 LABEL_1 0.788190 1 0.211810
190 190 LABEL_1 0.779400 1 0.220600
554 554 LABEL_1 0.772625 1 0.227375
prob_class_1
191 0.857407
189 0.855446
202 0.788190
190 0.779400
554 0.772625
index of true 130
# get the sequence scores 21 aa in front to get the vfpep more correct